*******************************************************************************
*This file cleans IFR and KLEMS data. Adapted version of Pascual Restrepo, JPE*
*******************************************************************************
cap: program drop klems_recode
program klems_recode
	replace desc=ltrim(desc)
	replace desc=rtrim(desc)
	replace code=ltrim(code)
	replace code=rtrim(code)
	gen industry_ifr19=""
	replace industry_ifr19 = "agriculture" if inlist(code,"AtB")
	replace industry_ifr19 = "mining" if inlist(code,"C")
	replace industry_ifr19 = "food" if inlist(code,"15t16")
	replace industry_ifr19 = "textiles" if inlist(code,"17t19")
	replace industry_ifr19 = "furniture" if inlist(code,"20")
	replace industry_ifr19 = "paper" if inlist(code,"21t22")
	replace industry_ifr19 = "petrochemicals" if inlist(code,"23t25")
	replace industry_ifr19 = "mineral" if inlist(code,"26")
	replace industry_ifr19 = "metal_basic" if inlist(code,"27")
	replace industry_ifr19 = "metal_products" if inlist(code,"28")
	replace industry_ifr19 = "metal_machinery" if inlist(code,"29")
	replace industry_ifr19 = "electronics" if inlist(code,"30t33")
	replace industry_ifr19 = "automotive" if inlist(code,"34")
	replace industry_ifr19 = "vehicles_other" if inlist(code,"35")
	replace industry_ifr19 = "manufacturing_other" if inlist(code,"36t37")
	replace industry_ifr19 = "utilities" if inlist(code,"E")
	replace industry_ifr19 = "construction" if inlist(code,"F")
	replace industry_ifr19 = "services" if inlist(code,"G","H","I","JtK","L", "N", "O", "P", "Q")
	replace industry_ifr19 = "research" if inlist(code,"M")	
end

*List of variables to extract from KLEMS*
global agg_varlist "EMP EMPE H_EMP H_EMPE GO GO_QI"

*place holder for the data*
clear
gen country_klems=""
save "$clean_data_automation/klems_data_ifr19.dta", replace

***********************************************************************************************************************************************************************************
*Step 2: Loop over the countries used in the analysis: Denmark (dnk), Finland (fin), France (fra), Germany (ger), Italy (ita), Spain (esp), Sweden (swe), UK (uk), USA (usa-naics)*
***********************************************************************************************************************************************************************************	
foreach cname in ger dnk fin fra  ita esp swe uk usa-naics{

clear
set obs 19
gen industry_ifr19=""
replace industry_ifr19 = "agriculture" in 1
replace industry_ifr19 = "mining" in 2
replace industry_ifr19 = "food" in 3
replace industry_ifr19 = "textiles" in 4
replace industry_ifr19 = "furniture" in 5
replace industry_ifr19 = "paper" in 6
replace industry_ifr19 = "petrochemicals" in 7
replace industry_ifr19 = "mineral" in 8
replace industry_ifr19 = "metal_basic" in 9
replace industry_ifr19 = "metal_products" in 10
replace industry_ifr19 = "metal_machinery" in 11
replace industry_ifr19 = "electronics" in 12
replace industry_ifr19 = "automotive" in 13
replace industry_ifr19 = "vehicles_other" in 14
replace industry_ifr19 = "manufacturing_other" in 15
replace industry_ifr19 = "utilities" in 16
replace industry_ifr19 = "construction" in 17
replace industry_ifr19 = "services" in 18
replace industry_ifr19 = "research" in 19
gen country_klems="`cname'"
tempfile country_data
save `country_data', replace

foreach outcome in $agg_varlist {
		
		disp "---`outcome' for `cname'---"
		if "`cname'"!="usa-naics" {
		local filename "`cname'_output_09II.xls"
		}
		else {
		local filename "`cname'_output_07I.xls"
		}
		
		*import klems data from excel*
		import excel "$raw_data_industry/`filename'", sheet("`outcome'") firstrow clear
		
		*recode categories*
		klems_recode
		*verify that all replacements were made; otherwise do not save country*
		count if industry_ifr19!=""
		assert r(N)==27
		keep if industry_ifr19!=""
		rename _* `outcome'_*
		
		*aggregate data, first for employment, then for dollar measures, then for indices*
		if inlist("`outcome'", "EMP", "EMPE", "H_EMP", "H_EMPE")==1 {
		
		*Allocate 40% of employment in Miscellaneous manufacturing to Furniture*
		expand 2 if industry_ifr19 == "manufacturing_other"
		sort code
		foreach var of varlist `outcome'_*{
		bys code: replace `var'=0.4*`var' if industry_ifr19 == "manufacturing_other" & _n==1
		bys code: replace `var'=0.6*`var' if industry_ifr19 == "manufacturing_other" & _n==2
		}
		bys code: replace industry_ifr19="furniture" if industry_ifr19 == "manufacturing_other" & _n==1
		
		*Aggregate hours/employment*
		collapse (sum) `outcome'_*, by(industry_ifr19)
		
		}
		else if inlist("`outcome'", "GO")==1 {
		
		*Allocate 40% of income in Miscellaneous manufacturing to Furniture*
		expand 2 if industry_ifr19 == "manufacturing_other"
		sort code
		foreach var of varlist `outcome'_*{
		bys code: replace `var'=0.4*`var' if industry_ifr19 == "manufacturing_other" & _n==1
		bys code: replace `var'=0.6*`var' if industry_ifr19 == "manufacturing_other" & _n==2
		}
		bys code: replace industry_ifr19="furniture" if industry_ifr19 == "manufacturing_other" & _n==1
		
		*Aggregate and convert to constant units of local currency*
		collapse (sum) `outcome'_*, by(industry_ifr19)
		
		*National price index*
		preserve
		import excel "$raw_data_industry/`filename'", sheet("`outcome'_P") firstrow clear
		replace code=ltrim(code)
	    replace code=rtrim(code)
		keep if code=="TOT"
		rename _* Price_*
		gen id=1
		tempfile prices
		save `prices', replace
		restore
		
		gen id=1
		merge m:1 id using `prices', assert(3) nogenerate
		
		*Deflate using national price index*
		forvalues j=1970(1)2007{
		cap: gen `outcome'_real_`j'=100*`outcome'_`j'/Price_`j'
		}
		
		}
		else if strpos("`outcome'","_QI")!=0 {
		
		*Aggregate using shares of variable as weights*
		local outcome2=subinstr("`outcome'", "_QI", "", .)
		preserve
		import excel "$raw_data_industry/`filename'", sheet("`outcome2'") firstrow clear
		
		*recode categories*
		klems_recode
		*verify that all replacements were made; otherwise do not save country*
		count if industry_ifr19!=""
		assert r(N)==27
		keep if industry_ifr19!=""
		rename _* shares_*
		
		*Allocate 40% of income in Miscellaneous manufacturing to Furniture*
		expand 2 if industry_ifr19 == "manufacturing_other"
		sort code
		foreach var of varlist shares_*{
		bys code: replace `var'=0.4*`var' if industry_ifr19 == "manufacturing_other" & _n==1
		bys code: replace `var'=0.6*`var' if industry_ifr19 == "manufacturing_other" & _n==2
		}
		bys code: replace industry_ifr19="furniture" if industry_ifr19 == "manufacturing_other" & _n==1
		tempfile shares
		save `shares', replace
		restore
		
		*Generate space for allocation of Miscellaneous manufacturing*
		expand 2 if industry_ifr19 == "manufacturing_other"
		bys code: replace industry_ifr19="furniture" if industry_ifr19 == "manufacturing_other" & _n==1
		
		merge 1:1 code industry_ifr using `shares', assert(3) nogenerate
		
		*Aggregate indices*
		forvalues year=1970(1)2007{
		cap: bys industry_ifr: egen agg_`outcome'_`year'=total(`outcome'_`year'*shares_`year')
		cap: bys industry_ifr: egen agg_`year'=total((`outcome'_`year'!=.)*shares_`year')
		cap: replace `outcome'_`year'=agg_`outcome'_`year'/agg_`year'
		}
		bys industry_ifr19: keep if _n==_N
			
		}
		
		
		
		*merge and save
		merge 1:1 industry_ifr19 using  `country_data', assert(2 3) nogenerate
		order country_klems industry 
		save `country_data', replace
		}
		
append using "$clean_data_automation/klems_data_ifr19.dta"
save "$clean_data_automation/klems_data_ifr19.dta", replace
		
}

**************************************************************
*Step 3: Fill in country names and compute relevant variables*
**************************************************************
gen country=""
replace country="Denmark" if country_klems=="dnk"
replace country="Finland" if country_klems=="fin"
replace country="France" if country_klems=="fra"
replace country="Germany" if country_klems=="ger"
replace country="Italy" if country_klems=="ita"
replace country="Spain" if country_klems=="esp"
replace country="Sweden" if country_klems=="swe"
replace country="United Kingdom" if country_klems=="uk"
replace country="United States" if country_klems=="usa-naics"
save "$clean_data_automation/klems_data_ifr19.dta", replace

********************************************************************************************************************
*Step 4: Impute data for Norway/ needs manual imputations from other scandinavian countries appropriately re-scaled*
********************************************************************************************************************
use "$clean_data_automation/klems_data_ifr19.dta", clear
keep if country_klems=="dnk" | country_klems=="swe" | country_klems=="fin"
egen total_hours=total(H_EMP_1995)
sum total_hours
scalar adjustment_factor=(2000*1450/1000)/r(mean) /*hours worked in Norway relative to other scandinavian countries*/

*Aggregate variables*
foreach var in EMP EMPE H_EMP H_EMPE{
forvalues j=1970(1)2007{
bys industry_ifr19: replace `var'_`j'=`var'_`j'*adjustment_factor
}
}
collapse (sum) EMP_* EMPE_* H_EMP_* H_EMPE_* , by(industry_ifr)
gen country_klems="nor"
gen country="Norway"
append using "$clean_data_automation/klems_data_ifr19.dta"
save "$clean_data_automation/klems_data_ifr19.dta", replace

